In [1]:
import scipy as sp
import matplotlib.pylab as plt
import seaborn as sea
import pandas as pd
%pylab inline
In [2]:
from example import read_games
# Materialise everything read_games() yields into a list: later cells
# iterate over `recs` several times.
recs = list(read_games())
In [3]:
from collections import defaultdict
# Tally, for every key found in a record's 'data' mapping, how many
# records contain that key.
topkeys = defaultdict(int)
for rec in recs:
    for k in rec['data']:
        topkeys[k] += 1

# Show the alphabetically ordered list of all keys seen.
print(sorted(topkeys))
In [4]:
# Per-key profile, one printed row per key: the key, the number of records
# containing it, the number of distinct (stringified) truthy values, and the
# lexicographically smallest / largest value (each cut to 20 characters).
for ky, tot in sorted(topkeys.items()):
    count = 0
    uniq = set()
    mn = mx = None
    for rec in recs:
        val = rec['data'].get(ky)
        if val:
            # Records with a truthy value; tallied but not part of the printed row.
            count += 1
            # Containers collapse to a placeholder; everything else is
            # compared as stripped text.
            if isinstance(val, dict):
                val = "<DICT>"
            elif isinstance(val, list):
                val = "<LIST>"
            else:
                val = str(val).strip()
            uniq.add(val)
            mn = val if mn is None else min(mn, val)
            mx = val if mx is None else max(mx, val)
    row = (ky, tot, len(uniq), str(mn)[:20], str(mx)[:20])
    print("%-25s %8d %8d %20s %20s" % row)
In [5]:
from random import choice
from collections import Counter
from pprint import PrettyPrinter
# One shared pretty-printer configured for 120-character-wide output.
pp = PrettyPrinter(width=120)
# Shorthand so cells can call pprint(obj) using the printer configured above.
pprint = pp.pprint
def fields(name):
    """Yield the value stored under ``name`` in each record's 'data' mapping.

    Iterates over the notebook-level ``recs``; records where the key is
    absent or its value is falsy are skipped.
    """
    for rec in recs:
        value = rec['data'].get(name)
        if not value:
            continue
        yield value
# Collect every truthy 'support_info' value so it can be indexed and sampled below.
field_values = list(fields('support_info'))
def out(v):
    """Pretty-print ``v`` via ``pprint`` and follow it with an empty line."""
    pprint(v)
    print()
# Show the first and the last collected value ...
for sample in (field_values[0], field_values[-1]):
    out(sample)

# ... followed by ten values picked at random (repeats are possible).
for _ in range(10):
    out(choice(field_values))
# def scan_values():
# for fv in field_values:
# free, purchase, subscript = False, False, False
# for pg in fv:
# if pg.get('is_recurring_subscription', '') == 'true':
# subscript = True
# for s in pg.get('subs', list()):
# if s.get('is_free_license', None):
# free = True
# elif s.get('price_in_cents_with_discount', 0) > 0:
# purchase = True
# yield (free, purchase, subscript)
# pprint(Counter(list(scan_values())))
# pprint(Counter([k for fv in field_values for k in fv.keys()]))
# pprint(Counter([i.get('description', 'MISSING').lower() for fv in field_values for i in fv]))
Each of these should be a boolean column (from a list of dicts; check 'description')
Like categories, it's a list of dicts. Use the dict['description'] for text and discretize:
If description is empty string, then ignore it.
All descriptions not described below are GenreIsOther
These descriptions become GenreIsXXX:
These descriptions become GenreIsNotGame:
Applied to linux_requirements, mac_requirements, and pc_requirements
Value is a dictionary with possible keys:
Turn into boolean, i.e.
3 boolean columns: FreeVerAvail, PurchaseAvail, SubscriptionAvail
Code for all:
def package_groups(rec):
    """Derive availability flags from a record's ``package_groups`` list.

    Args:
        rec: Record dict. The groups are read from
            ``rec["data"]["package_groups"]``; if either key is missing
            the record is treated as having no package groups.

    Returns:
        Tuple ``(free, purchase, subscript)`` of booleans:
        ``free`` is True if any sub has a truthy ``is_free_license``;
        ``purchase`` is True if any non-free sub has
        ``price_in_cents_with_discount`` greater than 0;
        ``subscript`` is True if any group has
        ``is_recurring_subscription`` equal to the string ``'true'``.
    """
    # The fallback must be an empty list *instance*: the bare ``list`` type
    # is not iterable and would raise TypeError in the loop below.
    pgs = rec.get("data", dict()).get("package_groups", list())
    free, purchase, subscript = False, False, False
    for pg in pgs:
        # The flag is compared against the string 'true', not a boolean.
        if pg.get('is_recurring_subscription', '') == 'true':
            subscript = True
        for s in pg.get('subs', list()):
            if s.get('is_free_license', None):
                free = True
            elif s.get('price_in_cents_with_discount', 0) > 0:
                # Only subs that are not free licenses count as purchases.
                purchase = True
    return (free, purchase, subscript)
In [6]:
# Load the generated feature table from 'games-features.csv' (path is
# relative to the working directory), list its column names, and preview
# the first rows as the cell output.
gf = pd.read_csv('games-features.csv')
print(gf.columns)
gf.head()
Out[6]:
In [7]:
def invest(cname):
    """Summarise and plot the strictly positive values of one ``gf`` column.

    Prints a header line and the ``describe()`` summary of the values of
    ``gf[cname]`` that are greater than zero, draws their distribution on a
    new 8x6 figure, and finishes with an empty line.

    Args:
        cname: Name of the column in the notebook-level ``gf`` frame.
    """
    print(cname, " > 0")
    column = gf[cname]
    nums = column[column > 0]
    print(nums.describe())
    plt.figure(figsize=(8, 6))
    # NOTE(review): distplot is reported as deprecated in newer seaborn
    # releases -- confirm against the installed version before upgrading.
    sea.distplot(nums)
    print()
# Inspect the positive-value distribution of each count column in turn.
for count_column in ("DemoCount", "DLCCount", "RecommendationCount"):
    invest(count_column)
In [9]:
# Display the 'PCMinReqsText' column of gf as the cell output.
gf["PCMinReqsText"]
Out[9]: